In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Load Building Permit dataset for Chicago¶

In [ ]:
chicago_df=pd.read_csv('../src/data/Building_Permits.csv')
C:\Users\44742\AppData\Local\Temp\ipykernel_245908\1767199503.py:1: DtypeWarning: Columns (1,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98) have mixed types. Specify dtype option on import or set low_memory=False.
  chicago_df=pd.read_csv('../src/data/Building_Permits.csv')
In [ ]:
chicago_df.shape
Out[ ]:
(730511, 119)
In [ ]:
chicago_df.iloc[:,50:70].sample(3)
Out[ ]:
CONTACT_6_TYPE CONTACT_6_NAME CONTACT_6_CITY CONTACT_6_STATE CONTACT_6_ZIPCODE CONTACT_7_TYPE CONTACT_7_NAME CONTACT_7_CITY CONTACT_7_STATE CONTACT_7_ZIPCODE CONTACT_8_TYPE CONTACT_8_NAME CONTACT_8_CITY CONTACT_8_STATE CONTACT_8_ZIPCODE CONTACT_9_TYPE CONTACT_9_NAME CONTACT_9_CITY CONTACT_9_STATE CONTACT_9_ZIPCODE
665460 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
424187 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
17018 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

We have a lot of nulls¶

Let's drop columns where the number of nulls is 15% or more

In [ ]:
#Calculate the percentage of null rows for each column
col_nulls=chicago_df.isna().sum()*100/chicago_df.shape[0]
#Valid columns (where the number of nulls is less than 15%)
col_valid=col_nulls[col_nulls<15].sort_values(ascending=False)
In [ ]:
#Non valid columns (where the number of nulls is 15% or more)
col_non_valid=col_nulls[col_nulls>=15].sort_values(ascending=False)
col_non_valid.index
Out[ ]:
Index(['CONTACT_15_TYPE', 'CONTACT_15_ZIPCODE', 'CONTACT_15_STATE',
       'CONTACT_15_CITY', 'CONTACT_15_NAME', 'CONTACT_14_STATE',
       'CONTACT_14_TYPE', 'CONTACT_14_NAME', 'CONTACT_14_CITY',
       'CONTACT_14_ZIPCODE', 'CONTACT_13_TYPE', 'CONTACT_13_NAME',
       'CONTACT_13_CITY', 'CONTACT_13_STATE', 'CONTACT_13_ZIPCODE',
       'CONTACT_12_ZIPCODE', 'CONTACT_12_STATE', 'CONTACT_12_CITY',
       'CONTACT_12_TYPE', 'CONTACT_12_NAME', 'CONTACT_11_ZIPCODE',
       'CONTACT_11_STATE', 'CONTACT_11_CITY', 'CONTACT_11_NAME',
       'CONTACT_11_TYPE', 'PIN10', 'PIN9', 'PIN8', 'CONTACT_10_ZIPCODE',
       'CONTACT_10_STATE', 'PIN7', 'CONTACT_10_CITY', 'CONTACT_10_TYPE',
       'CONTACT_10_NAME', 'PIN6', 'PIN5', 'PIN4', 'CONTACT_9_ZIPCODE',
       'CONTACT_9_STATE', 'CONTACT_9_CITY', 'CONTACT_9_TYPE', 'CONTACT_9_NAME',
       'PIN3', 'CONTACT_8_ZIPCODE', 'CONTACT_8_STATE', 'CONTACT_8_CITY',
       'CONTACT_8_NAME', 'CONTACT_8_TYPE', 'PIN2', 'CONTACT_7_ZIPCODE',
       'CONTACT_7_STATE', 'CONTACT_7_CITY', 'CONTACT_7_NAME', 'CONTACT_7_TYPE',
       'CONTACT_6_ZIPCODE', 'CONTACT_6_STATE', 'CONTACT_6_CITY',
       'CONTACT_6_NAME', 'CONTACT_6_TYPE', 'CONTACT_5_ZIPCODE',
       'CONTACT_5_STATE', 'CONTACT_5_CITY', 'CONTACT_5_TYPE', 'CONTACT_5_NAME',
       'CONTACT_4_ZIPCODE', 'CONTACT_4_STATE', 'CONTACT_4_CITY',
       'CONTACT_4_NAME', 'CONTACT_4_TYPE', 'CONTACT_3_ZIPCODE',
       'CONTACT_3_STATE', 'CONTACT_3_CITY', 'CONTACT_3_NAME', 'CONTACT_3_TYPE',
       'CONTACT_2_ZIPCODE', 'CONTACT_2_STATE', 'CONTACT_2_CITY',
       'CONTACT_2_NAME', 'CONTACT_2_TYPE'],
      dtype='object')
In [ ]:
#Drop the non-valid columns
chicago_clean=chicago_df.drop(columns=col_non_valid.index)
chicago_clean;

Let's drop columns that do not hold valuable information for the analysis

In [ ]:
chicago_wip=chicago_clean.drop(columns=['ID','PERMIT#','PIN1','XCOORDINATE','YCOORDINATE'])
chicago_wip.shape
Out[ ]:
(730511, 35)

Let's drop columns that might hold useful information but will be ommited from the initial EDA

In [ ]:
chicago_wip2=chicago_wip.drop(columns=['CONTACT_1_STATE','CONTACT_1_ZIPCODE','SUFFIX','CONTACT_1_ZIPCODE','CONTACT_1_NAME','LOCATION'])
print(chicago_wip2.shape)
chicago_wip2;
(730511, 30)
In [ ]:
chicago_wip3=chicago_wip2.drop(columns=['STREET_NAME','CONTACT_1_CITY','STREET_NAME','OTHER_FEE_PAID','STREET DIRECTION','ZONING_FEE_PAID','SUBTOTAL_WAIVED','ZONING_FEE_UNPAID','OTHER_FEE_WAIVED',\
                                        'BUILDING_FEE_UNPAID','OTHER_FEE_UNPAID','BUILDING_FEE_WAIVED'])
chicago_wip3
Out[ ]:
PERMIT_TYPE REVIEW_TYPE APPLICATION_START_DATE ISSUE_DATE PROCESSING_TIME STREET_NUMBER WORK_DESCRIPTION BUILDING_FEE_PAID SUBTOTAL_PAID SUBTOTAL_UNPAID ZONING_FEE_WAIVED TOTAL_FEE CONTACT_1_TYPE REPORTED_COST COMMUNITY_AREA CENSUS_TRACT WARD LATITUDE LONGITUDE
0 PERMIT - RENOVATION/ALTERATION STANDARD PLAN REVIEW 10/14/2005 01/03/2006 81.0 2728 INTERIOR REMODELING OF EXISTING 3 D.U. PER PLA... 125.0 200.0 0.0 0.0 200.0 OWNER AS GENERAL CONTRACTOR 3000.0 NaN NaN NaN 41.930957 -87.718543
1 PERMIT - NEW CONSTRUCTION STANDARD PLAN REVIEW 12/05/2005 03/29/2006 114.0 1409 new construction, erect 3 story slab on grade ... 1029.6 1242.1 0.0 0.0 1242.1 ARCHITECT 190000.0 NaN NaN NaN 41.862155 -87.728575
2 PERMIT - SIGNS SIGN PERMIT 10/17/2005 01/12/2006 87.0 120 north elevation 386431 56.0 56.0 0.0 0.0 56.0 CONTRACTOR-ELECTRICAL 0.0 NaN NaN NaN 41.880356 -87.632466
3 PERMIT - RENOVATION/ALTERATION SELF CERT 01/25/2006 01/26/2006 1.0 71 Interior alteration of existing partial 34th f... 2408.0 2483.0 0.0 0.0 2483.0 SELF CERT ARCHITECT 125000.0 NaN NaN NaN 41.880718 -87.636589
4 PERMIT - RENOVATION/ALTERATION SELF CERT 01/21/2006 01/21/2006 0.0 4901 Interior alterations to the retail space on th... 2443.0 2518.0 0.0 0.0 2518.0 SELF CERT ARCHITECT 41527.0 NaN NaN NaN 41.803882 -87.742911
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
730506 PERMIT - EASY PERMIT PROCESS EASY PERMIT 02/15/2023 02/24/2023 9.0 6926 SOLAR PANEL PERMIT. ERECT A PHOTOVOLTAIC ARRA... 0.0 0.0 275.0 0.0 275.0 STRUCTURAL ENGINEER 18031.0 65.0 650500.0 13.0 41.766834 -87.725048
730507 PERMIT - EASY PERMIT PROCESS EASY PERMIT WEB 02/24/2023 02/24/2023 0.0 2529 FENCES: 165FT 0IN X 6FT 0IN: QTY 1 (not to be ... 150.0 150.0 0.0 0.0 150.0 OWNER AS GENERAL CONTRACTOR 5900.0 22.0 220400.0 32.0 41.927464 -87.701016
730508 PERMIT - ELECTRIC WIRING EASY PERMIT WEB 02/24/2023 02/24/2023 0.0 1125 TECH WILL PULL 100 CAT-6 CABLES, TERMINATE, AN... 75.0 75.0 0.0 0.0 75.0 CONTRACTOR-ELECTRICAL 9000.0 8.0 81202.0 2.0 41.902604 -87.628254
730509 PERMIT - EASY PERMIT PROCESS EASY PERMIT 02/24/2023 02/24/2023 0.0 3328 REVISION TO PERMIT NO 100988130 TO CHANGE ELEC... 75.0 75.0 0.0 0.0 75.0 CONTRACTOR-ELECTRICAL 1.0 27.0 837300.0 24.0 41.871944 -87.709470
730510 PERMIT - ELECTRIC WIRING EASY PERMIT WEB 02/24/2023 02/24/2023 0.0 12423 REPAIR SERVICE 75.0 75.0 0.0 0.0 75.0 CONTRACTOR-ELECTRICAL 500.0 53.0 530503.0 34.0 41.668048 -87.637825

730511 rows × 19 columns

Let's check the null values for the remaining columns

In [ ]:
#
chicago_wip3.isnull().sum()*100/chicago_wip3.shape[0]
Out[ ]:
PERMIT_TYPE                0.000000
REVIEW_TYPE                0.000958
APPLICATION_START_DATE     0.071457
ISSUE_DATE                 0.000000
PROCESSING_TIME            0.071457
STREET_NUMBER              0.000000
WORK_DESCRIPTION           0.007255
BUILDING_FEE_PAID          0.000000
SUBTOTAL_PAID              0.000000
SUBTOTAL_UNPAID            0.000000
ZONING_FEE_WAIVED          0.000000
TOTAL_FEE                  0.000000
CONTACT_1_TYPE             0.653378
REPORTED_COST              0.051060
COMMUNITY_AREA            14.351735
CENSUS_TRACT              14.927770
WARD                      14.380482
LATITUDE                   0.308551
LONGITUDE                  0.308551
dtype: float64
In [ ]:
chicago_wip3.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 730511 entries, 0 to 730510
Data columns (total 19 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   PERMIT_TYPE             730511 non-null  object 
 1   REVIEW_TYPE             730504 non-null  object 
 2   APPLICATION_START_DATE  729989 non-null  object 
 3   ISSUE_DATE              730511 non-null  object 
 4   PROCESSING_TIME         729989 non-null  float64
 5   STREET_NUMBER           730511 non-null  int64  
 6   WORK_DESCRIPTION        730458 non-null  object 
 7   BUILDING_FEE_PAID       730511 non-null  float64
 8   SUBTOTAL_PAID           730511 non-null  float64
 9   SUBTOTAL_UNPAID         730511 non-null  float64
 10  ZONING_FEE_WAIVED       730511 non-null  float64
 11  TOTAL_FEE               730511 non-null  float64
 12  CONTACT_1_TYPE          725738 non-null  object 
 13  REPORTED_COST           730138 non-null  float64
 14  COMMUNITY_AREA          625670 non-null  float64
 15  CENSUS_TRACT            621462 non-null  float64
 16  WARD                    625460 non-null  float64
 17  LATITUDE                728257 non-null  float64
 18  LONGITUDE               728257 non-null  float64
dtypes: float64(12), int64(1), object(6)
memory usage: 105.9+ MB
In [ ]:
chicago_wip3.describe()
Out[ ]:
PROCESSING_TIME STREET_NUMBER BUILDING_FEE_PAID SUBTOTAL_PAID SUBTOTAL_UNPAID ZONING_FEE_WAIVED TOTAL_FEE REPORTED_COST COMMUNITY_AREA CENSUS_TRACT WARD LATITUDE LONGITUDE
count 729989.000000 730511.000000 730511.000000 730511.000000 7.305110e+05 730511.000000 7.305110e+05 7.301380e+05 625670.000000 621462.000000 625460.000000 728257.000000 728257.000000
mean 22.629837 3459.539303 675.572852 808.444783 1.706921e+01 1.817716 9.478778e+02 2.179828e+05 31.532530 310674.564295 27.169176 41.870021 -87.672919
std 105.683331 2978.569953 5775.639286 6944.309270 2.928526e+03 135.210814 1.157143e+04 1.836746e+07 21.942272 317214.267549 15.097453 0.083764 0.059634
min -2876.000000 1.000000 0.000000 0.000000 -1.440000e+04 0.000000 -1.152747e+04 -1.000000e+03 0.000000 0.000000 1.000000 41.644670 -87.940225
25% 0.000000 946.000000 58.000000 75.000000 0.000000e+00 0.000000 7.500000e+01 5.000000e+02 12.000000 7207.000000 14.000000 41.803620 -87.707301
50% 0.000000 2714.000000 150.000000 200.000000 0.000000e+00 0.000000 2.250000e+02 4.500000e+03 28.000000 210400.000000 29.000000 41.885830 -87.662388
75% 8.000000 5346.000000 400.000000 450.000000 0.000000e+00 0.000000 5.000000e+02 2.000000e+04 46.000000 630400.000000 42.000000 41.930764 -87.631931
max 5699.000000 111601.000000 882191.200000 999683.920000 1.944425e+06 102250.000000 5.772092e+06 1.000000e+10 77.000000 980100.000000 50.000000 42.022780 -87.524677

Change data types

In [ ]:
chicago_wip3['ISSUE_DATE']=pd.to_datetime(chicago_wip3['ISSUE_DATE'])
chicago_wip3['APPLICATION_START_DATE']=pd.to_datetime(chicago_wip3['APPLICATION_START_DATE'])
In [ ]:
chicago_wip3.nunique()
Out[ ]:
PERMIT_TYPE                   11
REVIEW_TYPE                   11
APPLICATION_START_DATE      6832
ISSUE_DATE                  6171
PROCESSING_TIME             1697
STREET_NUMBER              10690
WORK_DESCRIPTION          563792
BUILDING_FEE_PAID          49426
SUBTOTAL_PAID              53591
SUBTOTAL_UNPAID             1307
ZONING_FEE_WAIVED            137
TOTAL_FEE                  56498
CONTACT_1_TYPE                26
REPORTED_COST              45221
COMMUNITY_AREA                78
CENSUS_TRACT                1671
WARD                          50
LATITUDE                  246124
LONGITUDE                 246079
dtype: int64

TOTAL NUMBER OF APPLICATIONS. IS OUR DATASET COMPLETE?¶

Let's check if we have consistent numbers of application for all years & wards

In [ ]:
chicago_wip3.groupby(chicago_wip3['ISSUE_DATE'].dt.year).count()['PERMIT_TYPE'].plot(figsize=(10,5),grid=True)
Out[ ]:
<AxesSubplot:xlabel='ISSUE_DATE'>
In [ ]:
chicago_wip3.groupby([chicago_wip3['ISSUE_DATE'].dt.year,chicago_wip3['ISSUE_DATE'].dt.month]).count()['PERMIT_TYPE'].plot(figsize=(20,10),grid=True)
Out[ ]:
<AxesSubplot:xlabel='ISSUE_DATE,ISSUE_DATE'>
In [ ]:
yearly_wards=chicago_wip3.groupby([chicago_wip3['ISSUE_DATE'].dt.year,'WARD']).count()['PERMIT_TYPE']
In [ ]:
yearly_wards.unstack().T.isna().sum()
Out[ ]:
ISSUE_DATE
2006    47
2007    25
2008     0
2009     0
2010     0
2011     0
2012     0
2013     0
2014     0
2015     0
2016     0
2017     0
2018     0
2019     0
2020     0
2021     0
2022     0
2023     0
dtype: int64

Let's only consider data from 2008 onwards. The data before 2008 appears incomplete

In [ ]:
[chicago_wip3['ISSUE_DATE'].dt.year>=2008]
Out[ ]:
[0         False
 1         False
 2         False
 3         False
 4         False
           ...  
 730506     True
 730507     True
 730508     True
 730509     True
 730510     True
 Name: ISSUE_DATE, Length: 730511, dtype: bool]
In [ ]:
chicago_wip4=chicago_wip3[chicago_wip3['ISSUE_DATE'].dt.year>=2008]
In [ ]:
chicago_wip4.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 635827 entries, 20 to 730510
Data columns (total 19 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   PERMIT_TYPE             635827 non-null  object        
 1   REVIEW_TYPE             635820 non-null  object        
 2   APPLICATION_START_DATE  635664 non-null  datetime64[ns]
 3   ISSUE_DATE              635827 non-null  datetime64[ns]
 4   PROCESSING_TIME         635664 non-null  float64       
 5   STREET_NUMBER           635827 non-null  int64         
 6   WORK_DESCRIPTION        635807 non-null  object        
 7   BUILDING_FEE_PAID       635827 non-null  float64       
 8   SUBTOTAL_PAID           635827 non-null  float64       
 9   SUBTOTAL_UNPAID         635827 non-null  float64       
 10  ZONING_FEE_WAIVED       635827 non-null  float64       
 11  TOTAL_FEE               635827 non-null  float64       
 12  CONTACT_1_TYPE          632737 non-null  object        
 13  REPORTED_COST           635816 non-null  float64       
 14  COMMUNITY_AREA          625605 non-null  float64       
 15  CENSUS_TRACT            621397 non-null  float64       
 16  WARD                    625395 non-null  float64       
 17  LATITUDE                633882 non-null  float64       
 18  LONGITUDE               633882 non-null  float64       
dtypes: datetime64[ns](2), float64(12), int64(1), object(4)
memory usage: 97.0+ MB
In [ ]:
chicago_wip4.nunique()
Out[ ]:
PERMIT_TYPE                   11
REVIEW_TYPE                   11
APPLICATION_START_DATE      6030
ISSUE_DATE                  5502
PROCESSING_TIME             1408
STREET_NUMBER              10546
WORK_DESCRIPTION          492558
BUILDING_FEE_PAID          42706
SUBTOTAL_PAID              46792
SUBTOTAL_UNPAID             1179
ZONING_FEE_WAIVED            123
TOTAL_FEE                  49062
CONTACT_1_TYPE                26
REPORTED_COST              42204
COMMUNITY_AREA                78
CENSUS_TRACT                1671
WARD                          50
LATITUDE                  219246
LONGITUDE                 219218
dtype: int64
In [ ]:
chicago_wip4['CENSUS_TRACT']
Out[ ]:
20             NaN
32        611900.0
33        830600.0
34        170600.0
46        100500.0
            ...   
730506    650500.0
730507    220400.0
730508     81202.0
730509    837300.0
730510    530503.0
Name: CENSUS_TRACT, Length: 635827, dtype: float64

High Level Overview¶

In [ ]:
yearly_wards=chicago_wip4.groupby([(chicago_wip4['ISSUE_DATE']).dt.year,'WARD']).count()['PERMIT_TYPE']
In [ ]:
yearly_wards.unstack().plot(figsize=(30,15),grid=True,title='Total Number of Applications by Ward')
Out[ ]:
<AxesSubplot:title={'center':'Total Number of Applications by Ward'}, xlabel='ISSUE_DATE'>
In [ ]:
reported_cost_wards_total=chicago_wip4.groupby([chicago_wip3['ISSUE_DATE'].dt.year,'WARD'])['REPORTED_COST'].sum()
In [ ]:
reported_cost_wards=chicago_wip4.groupby([chicago_wip3['ISSUE_DATE'].dt.year,'WARD'])['REPORTED_COST'].sum()/chicago_wip4.groupby([chicago_wip3['ISSUE_DATE'].dt.year,'WARD'])['REPORTED_COST'].count()
In [ ]:
reported_cost_wards.unstack().plot(figsize=(30,15), grid=True, title='AVG reported cost per application by ward')
Out[ ]:
<AxesSubplot:title={'center':'AVG reported cost per application by ward'}, xlabel='ISSUE_DATE'>
In [ ]:
reported_cost_community=chicago_wip4.groupby([chicago_wip3['ISSUE_DATE'].dt.year,'COMMUNITY_AREA'])['REPORTED_COST'].sum()/chicago_wip4.groupby([chicago_wip3['ISSUE_DATE'].dt.year,'COMMUNITY_AREA'])['REPORTED_COST'].count()
In [ ]:
reported_cost_community.unstack().plot(figsize=(30,15), grid=True, title='AVG reported cost per application by community area');
In [ ]:
#What types of Permit Types are there?
chicago_wip4['PERMIT_TYPE'].unique()
Out[ ]:
array(['PERMIT - SIGNS', 'PERMIT - WRECKING/DEMOLITION',
       'PERMIT - EASY PERMIT PROCESS', 'PERMIT - RENOVATION/ALTERATION',
       'PERMIT - ELECTRIC WIRING', 'PERMIT - NEW CONSTRUCTION',
       'PERMIT - ELEVATOR EQUIPMENT', 'PERMIT - REINSTATE REVOKED PMT',
       'PERMIT - SCAFFOLDING', 'PERMIT - FOR EXTENSION OF PMT',
       'PERMIT - PORCH CONSTRUCTION'], dtype=object)
In [ ]:
chicago_wip4.groupby('PERMIT_TYPE').count()['ISSUE_DATE'].plot(kind='bar')
Out[ ]:
<AxesSubplot:xlabel='PERMIT_TYPE'>
In [ ]:
#Can text processing be applied for this column to find any patterns/ form clusters?
chicago_wip4['WORK_DESCRIPTION'].sample(20)
Out[ ]:
76774            HOT WATER HEATER (REPLACEMENT ONLY): QTY 1
454393           REPLACE WIRING, OUTLETS AND LIGHT FIXTURES
427136    REPLACEMENT OF FRONT WOOD PORCH IN SAME LOCATI...
86309     REPLACE 5 EXISTING GLASS BLOCK. REPAIR CONCRET...
183000              INSTALLATION OF ELECTRICAL TIMER SWITCH
81463                    INSTALL LOW VOLTAGE BURGLAR ALARM.
325615    SELF CERT: INTERIOR ALTERATIONS TO 32ND FLOOR ...
76703                                                GARAGE
484130                                               GARAGE
533248    ADDRESS CODE VIOLATIONS ALL NEW DRYWALL EW FIX...
637941                     INSTALL LOW VOLTAGE SOUND SYSTEM
170057    INSTALLATION OF LOW VOLTAGE BURGLAR ALARM # 34...
270098    SEPTEMBER MAINTENANCE/COMPUTER/PHONE/WIRELESS ...
287470    ALTERATIONS TO FIRST FLOOR TENANT LOBBY AND 5T...
60886         INTERIOR RENOVATION AND REAR 2 STORY ADDITION
174204    CONCRETE PATCHING & REPAIRS AT GRADE LEVEL BUI...
611780    All Elevation, repair/replace sealant 6000 Lin...
207138    GENERAL MONTHLY MAINTENANCE, NOVEMBER, 2011; J...
682854    INTERIOR ALTERATIONS & REPLACE ENCLOSED FRAME ...
425371                                               GARAGE
Name: WORK_DESCRIPTION, dtype: object

How can we divide the data geographically?¶

Option 1: Great (but would require linking it to US Census Database)

[Defintion] Census Tracts are small, relatively permanent statistical subdivisions of a county or statistically equivalent entity that can be updated by local participants prior to each decennial census as part of the Census Bureau's Participant Statistical Areas Program (PSAP) https://www.census.gov/programs-surveys/geography/about/glossary.html#:~:text=Census%20Tracts%20are%20small%2C%20relatively,Statistical%20Areas%20Program%20(PSAP).

Option 2: Divide by WARD or COMMUNITY_AREA

In [ ]:
chicago_wip4['WARD'].unique()
Out[ ]:
array([nan, 20., 40., 38., 41., 24., 23., 29., 11., 46.,  8., 15., 42.,
       43.,  2., 45., 27., 25., 39., 48.,  1., 44.,  7.,  6., 14., 37.,
       32.,  4., 35., 21., 10., 50., 33., 22., 47., 26.,  9.,  5., 34.,
       19., 12., 17., 13., 18.,  3., 31., 28., 49., 36., 30., 16.])
In [ ]:
chicago_wip4['COMMUNITY_AREA'].unique()
Out[ ]:
array([nan, 61.,  1., 17., 10., 29., 76., 65., 15., 25.,  3., 43., 66.,
        8., 28., 33., 34., 13., 16., 77., 24., 32., 57.,  6., 69., 58.,
       22., 21., 71., 46.,  2., 14., 51., 11.,  5., 20., 53., 44.,  7.,
       30., 60., 75., 12., 64., 72., 63., 48., 73., 70., 59., 37., 56.,
        4., 39., 35., 49., 42., 23., 27., 19., 45., 50., 74., 68.,  0.,
       52., 31., 67., 55., 41., 47.,  9., 18., 26., 62., 38., 40., 54.,
       36.])

Ultimately, the gropgraphic division will depend on the structure of the target variable dataset

Let's load out current target variable dataset.

In [ ]:
prices_df_raw=pd.read_csv('../src/data/Chicago_price_index_data.csv')
In [ ]:
prices_df_raw.head();
In [ ]:
#let's drop unnecessary columns
prices_df_wip=prices_df.drop(columns=['COOK','SUBURBS'])
prices_df_wip.head()
Out[ ]:
YEARQ CHICAGO Palatine/Barrington Melrose Park/Maywood Oak Park/Cicero LaGrange/Burbank Orland Park/Lemont Oak Lawn/Blue Island Oak Forest/Country Club Hills Calumet City/Harvey ... Chicago--Logan Square/Avondale Chicago--Humboldt Park/Garfield Park Chicago--West Town/Near West Side Chicago--Bridgeport/Brighton Park Chicago--Gage Park/West Lawn Chicago--Englewood/Greater Grand Crossing Chicago--Bronzeville/Hyde Park Chicago--Beverly/Morgan Park Chicago--Auburn Gresham/Chatham Chicago--South Chicago/West Pullman
0 1997Q1 80.51 88.45 90.59 82.87 84.73 92.37 92.80 93.44 91.80 ... 70.60 82.13 70.66 79.70 88.14 85.80 61.49 81.30 80.24 84.16
1 1997Q2 81.74 88.13 90.05 85.86 87.83 91.98 92.88 94.72 93.23 ... 72.17 86.80 70.21 82.34 89.83 86.00 72.59 85.60 84.36 90.44
2 1997Q3 83.66 88.66 90.26 86.83 89.65 93.76 93.68 95.50 93.36 ... 73.73 92.90 69.67 86.17 90.24 86.01 76.99 87.86 86.57 90.62
3 1997Q4 84.18 88.37 91.27 86.14 90.78 94.15 94.09 95.47 93.54 ... 72.93 93.54 68.52 88.45 90.15 87.37 78.16 89.31 87.62 89.88
4 1998Q1 85.49 88.83 90.71 86.89 91.34 94.44 94.40 95.97 93.71 ... 74.22 95.94 69.15 91.34 90.62 89.48 85.44 91.14 89.50 90.64

5 rows × 35 columns

Current dataset appears to be divided by COMMUNITY AREAS

In [ ]:
year=[year_.split('Q')[0] for year_ in prices_df_wip['YEARQ']]
year=[int(y) for y in year]
prices_df_wip.insert(1,'YEAR',year)
In [ ]:
quarter=[year_.split('Q')[1] for year_ in prices_df_wip['YEARQ']]
quarter=[int(q) for q in quarter]
prices_df_wip.insert(2,'QUARTER',quarter)
In [ ]:
prices_df_wip.head();
In [ ]:
prices_df_wip2=prices_df_wip[prices_df_wip['YEAR']>=2008]
In [ ]:
prices_df_wip3=prices_df_wip2.drop(columns='YEARQ')
In [ ]:
prices_df_wip3.columns[3:]
Out[ ]:
Index(['Palatine/Barrington', 'Melrose Park/Maywood', 'Oak Park/Cicero',
       'LaGrange/Burbank', 'Orland Park/Lemont', 'Oak Lawn/Blue Island',
       'Oak Forest/Country Club Hills', 'Calumet City/Harvey',
       'Chicago Heights/Park Forest', 'Arlington Heights/Wheeling',
       'Winnetka/Northbrook', 'Hoffman Estates/Streamwood', 'Schaumburg',
       'Mount Prospect/Elk Grove Village', 'Park Ridge/Des Plaines',
       'Evanston/Skokie', 'Elmwood Park/Franklin Park',
       'Chicago--Uptown/Rogers Park', 'Chicago--Lake View/Lincoln Park',
       'Chicago--Lincoln Square/North Center',
       'Chicago--Irving Park/Albany Park',
       'Chicago--Portage Park/Jefferson Park',
       'Chicago--Austin/Belmont Cragin', 'Chicago--Logan Square/Avondale',
       'Chicago--Humboldt Park/Garfield Park',
       'Chicago--West Town/Near West Side',
       'Chicago--Bridgeport/Brighton Park', 'Chicago--Gage Park/West Lawn',
       'Chicago--Englewood/Greater Grand Crossing',
       'Chicago--Bronzeville/Hyde Park', 'Chicago--Beverly/Morgan Park',
       'Chicago--Auburn Gresham/Chatham',
       'Chicago--South Chicago/West Pullman'],
      dtype='object')
In [ ]:
#Let's select neighbourhood names NOT starting with Chicago
non_chicago=[name for name in prices_df_wip3.columns[3:] if name[0:7]!='Chicago']
non_chicago
Out[ ]:
['Palatine/Barrington',
 'Melrose Park/Maywood',
 'Oak Park/Cicero',
 'LaGrange/Burbank',
 'Orland Park/Lemont',
 'Oak Lawn/Blue Island',
 'Oak Forest/Country Club Hills',
 'Calumet City/Harvey',
 'Arlington Heights/Wheeling',
 'Winnetka/Northbrook',
 'Hoffman Estates/Streamwood',
 'Schaumburg',
 'Mount Prospect/Elk Grove Village',
 'Park Ridge/Des Plaines',
 'Evanston/Skokie',
 'Elmwood Park/Franklin Park']
In [ ]:
prices_df_wip4=prices_df_wip3.drop(columns=non_chicago)

We need to assign a Community Area number to each column to be able to match it with the main (permit applications) dataset

In [ ]:
prices_df_wip4.head()
Out[ ]:
YEAR QUARTER CHICAGO Chicago Heights/Park Forest Chicago--Uptown/Rogers Park Chicago--Lake View/Lincoln Park Chicago--Lincoln Square/North Center Chicago--Irving Park/Albany Park Chicago--Portage Park/Jefferson Park Chicago--Austin/Belmont Cragin Chicago--Logan Square/Avondale Chicago--Humboldt Park/Garfield Park Chicago--West Town/Near West Side Chicago--Bridgeport/Brighton Park Chicago--Gage Park/West Lawn Chicago--Englewood/Greater Grand Crossing Chicago--Bronzeville/Hyde Park Chicago--Beverly/Morgan Park Chicago--Auburn Gresham/Chatham Chicago--South Chicago/West Pullman
44 2008 1 199.90 155.62 186.57 169.84 202.53 189.77 186.95 215.63 218.86 278.41 227.74 234.84 201.88 242.33 239.50 186.94 187.41 187.56
45 2008 2 194.15 148.34 185.11 173.84 199.33 185.57 179.89 208.45 219.68 262.04 225.33 229.64 194.27 231.29 233.37 179.23 183.09 182.65
46 2008 3 185.49 140.11 173.78 169.24 192.16 178.29 170.94 195.23 219.40 232.32 225.68 222.22 184.73 213.16 216.28 172.93 176.09 171.06
47 2008 4 177.70 133.30 177.28 165.41 188.11 173.33 164.78 177.94 213.48 198.29 223.29 217.12 175.37 202.28 198.29 165.64 169.20 164.49
48 2009 1 168.42 123.70 172.92 165.58 183.28 168.44 161.09 163.10 207.91 170.17 215.20 202.45 166.22 175.62 182.58 157.68 158.34 157.86
In [ ]:
prices_df_wip4.reset_index(inplace=True)
In [ ]:
prices_df_wip5=prices_df_wip4.drop(columns=['index','Chicago Heights/Park Forest'])

Need to assign the community area number to each column

In [ ]:
prices_df_wip5.head()
Out[ ]:
YEAR QUARTER CHICAGO Chicago--Uptown/Rogers Park Chicago--Lake View/Lincoln Park Chicago--Lincoln Square/North Center Chicago--Irving Park/Albany Park Chicago--Portage Park/Jefferson Park Chicago--Austin/Belmont Cragin Chicago--Logan Square/Avondale Chicago--Humboldt Park/Garfield Park Chicago--West Town/Near West Side Chicago--Bridgeport/Brighton Park Chicago--Gage Park/West Lawn Chicago--Englewood/Greater Grand Crossing Chicago--Bronzeville/Hyde Park Chicago--Beverly/Morgan Park Chicago--Auburn Gresham/Chatham Chicago--South Chicago/West Pullman
0 2008 1 199.90 186.57 169.84 202.53 189.77 186.95 215.63 218.86 278.41 227.74 234.84 201.88 242.33 239.50 186.94 187.41 187.56
1 2008 2 194.15 185.11 173.84 199.33 185.57 179.89 208.45 219.68 262.04 225.33 229.64 194.27 231.29 233.37 179.23 183.09 182.65
2 2008 3 185.49 173.78 169.24 192.16 178.29 170.94 195.23 219.40 232.32 225.68 222.22 184.73 213.16 216.28 172.93 176.09 171.06
3 2008 4 177.70 177.28 165.41 188.11 173.33 164.78 177.94 213.48 198.29 223.29 217.12 175.37 202.28 198.29 165.64 169.20 164.49
4 2009 1 168.42 172.92 165.58 183.28 168.44 161.09 163.10 207.91 170.17 215.20 202.45 166.22 175.62 182.58 157.68 158.34 157.86
In [ ]:
#for each column name [1] store the series, [2] 
In [ ]:
di={col:col.split('--')[1] for col in prices_df_wip5.columns[3:]}
di;
In [ ]:
prices_df_wip6=prices_df_wip5.rename(columns=di)
In [ ]:
prices_df_wip6.head()
Out[ ]:
YEAR QUARTER CHICAGO Uptown/Rogers Park Lake View/Lincoln Park Lincoln Square/North Center Irving Park/Albany Park Portage Park/Jefferson Park Austin/Belmont Cragin Logan Square/Avondale Humboldt Park/Garfield Park West Town/Near West Side Bridgeport/Brighton Park Gage Park/West Lawn Englewood/Greater Grand Crossing Bronzeville/Hyde Park Beverly/Morgan Park Auburn Gresham/Chatham South Chicago/West Pullman
0 2008 1 199.90 186.57 169.84 202.53 189.77 186.95 215.63 218.86 278.41 227.74 234.84 201.88 242.33 239.50 186.94 187.41 187.56
1 2008 2 194.15 185.11 173.84 199.33 185.57 179.89 208.45 219.68 262.04 225.33 229.64 194.27 231.29 233.37 179.23 183.09 182.65
2 2008 3 185.49 173.78 169.24 192.16 178.29 170.94 195.23 219.40 232.32 225.68 222.22 184.73 213.16 216.28 172.93 176.09 171.06
3 2008 4 177.70 177.28 165.41 188.11 173.33 164.78 177.94 213.48 198.29 223.29 217.12 175.37 202.28 198.29 165.64 169.20 164.49
4 2009 1 168.42 172.92 165.58 183.28 168.44 161.09 163.10 207.91 170.17 215.20 202.45 166.22 175.62 182.58 157.68 158.34 157.86
In [ ]:
prices_df_wip7=prices_df_wip6.copy()
for col in prices_df_wip6.columns[3:]:
    temp=prices_df_wip6[col]
    prices_df_wip7[col.split('/')[0]]=temp
    prices_df_wip7[col.split('/')[1]]=temp
    prices_df_wip7=prices_df_wip7.drop(columns=col)
In [ ]:
prices_df_wip7.head(1)
Out[ ]:
YEAR QUARTER CHICAGO Uptown Rogers Park Lake View Lincoln Park Lincoln Square North Center Irving Park ... Englewood Greater Grand Crossing Bronzeville Hyde Park Beverly Morgan Park Auburn Gresham Chatham South Chicago West Pullman
0 2008 1 199.9 186.57 186.57 169.84 169.84 202.53 202.53 189.77 ... 242.33 242.33 239.5 239.5 186.94 186.94 187.41 187.41 187.56 187.56

1 rows × 35 columns

In [ ]:
prices_df_wip7.columns
Out[ ]:
Index(['YEAR', 'QUARTER', 'CHICAGO', 'Uptown', 'Rogers Park', 'Lake View',
       'Lincoln Park', 'Lincoln Square', 'North Center', 'Irving Park',
       'Albany Park', 'Portage Park', 'Jefferson Park', 'Austin',
       'Belmont Cragin', 'Logan Square', 'Avondale', 'Humboldt Park',
       'Garfield Park', 'West Town', 'Near West Side', 'Bridgeport',
       'Brighton Park', 'Gage Park', 'West Lawn', 'Englewood',
       'Greater Grand Crossing', 'Bronzeville', 'Hyde Park', 'Beverly',
       'Morgan Park', 'Auburn Gresham', 'Chatham', 'South Chicago',
       'West Pullman'],
      dtype='object')
In [ ]:
map_dict ={'Uptown':3, 'Rogers Park':1,
       'Lake View':6, 'Lincoln Park':7, 'Lincoln Square':4, 'North Center':5,
       'Irving Park':16, 'Albany Park':14, 'Portage Park':15, 'Jefferson Park':11,
       'Austin':25, 'Belmont Cragin':19, 'Logan Square':22, 'Avondale':21, 'Humboldt Park':23,
       'Garfield Park':26, 'West Town':24, 'Near West Side':28, 'Bronzeville': 38, 'Bridgeport':60,
       'Brighton Park':58, 'Gage Park':63, 'West Lawn':65, 'Englewood':68,
       'Greater Grand Crossing':69, 'Hyde Park':41, 'Beverly':72,
       'Morgan Park':75, 'Auburn Gresham':71, 'Chatham':44, 'South Chicago':46,
       'West Pullman':53}
In [ ]:
prices_df_wip8=prices_df_wip7.rename(columns=map_dict)
In [ ]:
prices_df_wip8.columns
Out[ ]:
Index([   'YEAR', 'QUARTER', 'CHICAGO',         3,         1,         6,
               7,         4,         5,        16,        14,        15,
              11,        25,        19,        22,        21,        23,
              26,        24,        28,        60,        58,        63,
              65,        68,        69,        38,        41,        72,
              75,        71,        44,        46,        53],
      dtype='object')
In [ ]:
#prices_df_wip9=prices_df_wip8.drop(columns=['Heights','Park Forest','Bronzeville'])
In [ ]:
prices_df_wip8.groupby(['YEAR','QUARTER']).mean().plot(figsize=(30,15),grid=True)
Out[ ]:
<AxesSubplot:xlabel='YEAR,QUARTER'>
In [ ]:
areas=list(prices_df_wip8.columns)[3:]
In [ ]:
reported_cost_community2=chicago_wip4.groupby([chicago_wip4['ISSUE_DATE'].dt.year,chicago_wip4['ISSUE_DATE'].dt.quarter,'COMMUNITY_AREA'])['REPORTED_COST'].sum()/chicago_wip4.groupby([chicago_wip3['ISSUE_DATE'].dt.year,chicago_wip4['ISSUE_DATE'].dt.quarter,'COMMUNITY_AREA'])['REPORTED_COST'].count()
In [ ]:
reported_cost_community2.unstack()[areas].plot(figsize=(30,15), grid=True, title='AVG reported cost per application by community area');
In [ ]:
renov_df=chicago_wip4[chicago_wip4['PERMIT_TYPE']=='PERMIT - RENOVATION/ALTERATION']
In [ ]:
reported_cost_renov=renov_df.groupby([renov_df['ISSUE_DATE'].dt.year,renov_df['ISSUE_DATE'].dt.quarter,'COMMUNITY_AREA'])['REPORTED_COST'].sum()
In [ ]:
reported_cost_renov.unstack()[areas].plot(figsize=(30,15), grid=True, title='AVG reported cost per renovation application by community area');

ok, let's take the average of all predicted costs for each community and, then, calculate the percentage deviation from the average for each quarter

In [ ]:
reported_cost_renov
Out[ ]:
ISSUE_DATE  ISSUE_DATE  COMMUNITY_AREA
2008        1           1.0                 281000.0
                        2.0                 607600.0
                        3.0                 982000.0
                        4.0                 315250.0
                        5.0                1026460.0
                                             ...    
2023        1           73.0               1173000.0
                        74.0                857270.0
                        75.0               1398685.0
                        76.0              40112589.0
                        77.0               6861061.0
Name: REPORTED_COST, Length: 4649, dtype: float64
In [ ]:
reported_cost_renov2=reported_cost_renov.unstack()
reported_cost_renov2.head()
Out[ ]:
COMMUNITY_AREA 0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 9.0 ... 68.0 69.0 70.0 71.0 72.0 73.0 74.0 75.0 76.0 77.0
ISSUE_DATE ISSUE_DATE
2008 1 NaN 281000.0 607600.0 982000.0 315250.0 1026460.0 3153300.0 3638001.0 38789306.0 230000.0 ... 186300.0 54000.0 132000.0 254788.5 189000.0 55694.0 120000.00 425000.0 5265000.0 659470.0
2 NaN 1748300.0 2962600.0 8330545.0 3889175.0 15183874.0 8453456.0 6514764.0 73355910.0 903000.0 ... 872432.5 608500.0 1003400.0 663161.0 2987500.0 188120.0 2290853.12 1719156.0 5292701.0 5368979.0
3 NaN 1177480.0 2347900.0 5618026.0 4004101.0 7515100.0 14441690.0 9078501.0 71911931.0 732900.0 ... 2574300.0 6033376.0 611041.0 1142500.0 773500.0 4315040.0 1548840.00 922000.0 6785600.0 5119980.0
4 NaN 4763102.0 1495120.0 6630023.0 6984001.0 2602700.0 15027153.0 9567755.0 261083044.0 165800.0 ... 722000.0 4199138.0 898800.0 3203069.0 891600.0 1317155.0 742000.00 2571375.0 4109706.0 11691700.0
2009 1 NaN 3077900.0 1094800.0 8337900.0 2045583.0 5945896.0 12917878.0 8293880.0 77050278.0 715000.0 ... 3320367.0 1027160.0 1831000.0 228800.0 41500.0 291800.0 2425000.00 940125.0 2720620.0 5403770.0

5 rows × 78 columns

In [ ]:
reported_cost_renov2.index[:8]
Out[ ]:
MultiIndex([(2008, 1),
            (2008, 2),
            (2008, 3),
            (2008, 4),
            (2009, 1),
            (2009, 2),
            (2009, 3),
            (2009, 4)],
           names=['ISSUE_DATE', 'ISSUE_DATE'])
In [ ]:
years=[]
quarters=[]
for i in reported_cost_renov2.index:
    years.append(i[0])
    quarters.append(i[1])
In [ ]:
reported_cost_renov3=reported_cost_renov2.reset_index(drop=True)
In [ ]:
reported_cost_renov3.drop(columns=0.0,inplace=True)
In [ ]:
reported_cost_renov3.insert(0,'YEAR',years)
In [ ]:
reported_cost_renov3.insert(1,'QUARTER',quarters)
In [ ]:
reported_cost_renov3.head()
Out[ ]:
COMMUNITY_AREA YEAR QUARTER 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 ... 68.0 69.0 70.0 71.0 72.0 73.0 74.0 75.0 76.0 77.0
0 2008 1 281000.0 607600.0 982000.0 315250.0 1026460.0 3153300.0 3638001.0 38789306.0 ... 186300.0 54000.0 132000.0 254788.5 189000.0 55694.0 120000.00 425000.0 5265000.0 659470.0
1 2008 2 1748300.0 2962600.0 8330545.0 3889175.0 15183874.0 8453456.0 6514764.0 73355910.0 ... 872432.5 608500.0 1003400.0 663161.0 2987500.0 188120.0 2290853.12 1719156.0 5292701.0 5368979.0
2 2008 3 1177480.0 2347900.0 5618026.0 4004101.0 7515100.0 14441690.0 9078501.0 71911931.0 ... 2574300.0 6033376.0 611041.0 1142500.0 773500.0 4315040.0 1548840.00 922000.0 6785600.0 5119980.0
3 2008 4 4763102.0 1495120.0 6630023.0 6984001.0 2602700.0 15027153.0 9567755.0 261083044.0 ... 722000.0 4199138.0 898800.0 3203069.0 891600.0 1317155.0 742000.00 2571375.0 4109706.0 11691700.0
4 2009 1 3077900.0 1094800.0 8337900.0 2045583.0 5945896.0 12917878.0 8293880.0 77050278.0 ... 3320367.0 1027160.0 1831000.0 228800.0 41500.0 291800.0 2425000.00 940125.0 2720620.0 5403770.0

5 rows × 79 columns

In [ ]:
avg_costs=reported_cost_renov3.iloc[:,2:].mean()
avg_costs
Out[ ]:
COMMUNITY_AREA
1.0     6.703444e+06
2.0     4.263048e+06
3.0     1.242579e+07
4.0     6.658883e+06
5.0     7.783004e+06
            ...     
73.0    1.639715e+06
74.0    2.094342e+06
75.0    2.545023e+06
76.0    1.297205e+08
77.0    7.733792e+06
Length: 77, dtype: float64
In [ ]:
reported_cost_renov4=reported_cost_renov3.copy()
for col in reported_cost_renov3.columns[2:]:
    reported_cost_renov4[col]=reported_cost_renov3[col]/avg_costs[col]
    
In [ ]:
reported_cost_renov4.tail(1)
Out[ ]:
COMMUNITY_AREA YEAR QUARTER 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 ... 68.0 69.0 70.0 71.0 72.0 73.0 74.0 75.0 76.0 77.0
60 2023 1 0.246773 0.499056 1.565268 0.39376 0.480358 0.361615 0.529522 0.249376 ... 0.121149 6.474478 0.17894 0.450188 0.390287 0.715368 0.409327 0.549577 0.309223 0.887154

1 rows × 79 columns

We have visualized the house prices per community area - but is it an actually meaningful variable? In other words, is there a stistically significant difference between how the prices were changing for each community?

In [ ]:
prices_df_wip8.head()
Out[ ]:
YEAR QUARTER CHICAGO 3 1 6 7 4 5 16 ... 68 69 38 41 72 75 71 44 46 53
0 2008 1 199.90 186.57 186.57 169.84 169.84 202.53 202.53 189.77 ... 242.33 242.33 239.50 239.50 186.94 186.94 187.41 187.41 187.56 187.56
1 2008 2 194.15 185.11 185.11 173.84 173.84 199.33 199.33 185.57 ... 231.29 231.29 233.37 233.37 179.23 179.23 183.09 183.09 182.65 182.65
2 2008 3 185.49 173.78 173.78 169.24 169.24 192.16 192.16 178.29 ... 213.16 213.16 216.28 216.28 172.93 172.93 176.09 176.09 171.06 171.06
3 2008 4 177.70 177.28 177.28 165.41 165.41 188.11 188.11 173.33 ... 202.28 202.28 198.29 198.29 165.64 165.64 169.20 169.20 164.49 164.49
4 2009 1 168.42 172.92 172.92 165.58 165.58 183.28 183.28 168.44 ... 175.62 175.62 182.58 182.58 157.68 157.68 158.34 158.34 157.86 157.86

5 rows × 35 columns

In [ ]:
city_change=(prices_df_wip8['CHICAGO'].shift(1)-prices_df_wip8['CHICAGO'])
#price_change=(prices_df_wip8.shift(1)-prices_df_wip8)[2:]
In [ ]:
prices_df_wip8.head()
Out[ ]:
YEAR QUARTER CHICAGO 3 1 6 7 4 5 16 ... 68 69 38 41 72 75 71 44 46 53
0 2008 1 199.90 186.57 186.57 169.84 169.84 202.53 202.53 189.77 ... 242.33 242.33 239.50 239.50 186.94 186.94 187.41 187.41 187.56 187.56
1 2008 2 194.15 185.11 185.11 173.84 173.84 199.33 199.33 185.57 ... 231.29 231.29 233.37 233.37 179.23 179.23 183.09 183.09 182.65 182.65
2 2008 3 185.49 173.78 173.78 169.24 169.24 192.16 192.16 178.29 ... 213.16 213.16 216.28 216.28 172.93 172.93 176.09 176.09 171.06 171.06
3 2008 4 177.70 177.28 177.28 165.41 165.41 188.11 188.11 173.33 ... 202.28 202.28 198.29 198.29 165.64 165.64 169.20 169.20 164.49 164.49
4 2009 1 168.42 172.92 172.92 165.58 165.58 183.28 183.28 168.44 ... 175.62 175.62 182.58 182.58 157.68 157.68 158.34 158.34 157.86 157.86

5 rows × 35 columns

In [ ]:
pr_change_df=prices_df_wip8.iloc[:,0:2]
pr_change_df.head()
Out[ ]:
YEAR QUARTER
0 2008 1
1 2008 2
2 2008 3
3 2008 4
4 2009 1
In [ ]:
#need to divide each field by city_change to create a new column in the new df pr_change_df
for col in prices_df_wip8.columns[2:]:
    pr_change_df[col]=(prices_df_wip8[col].shift(1)-prices_df_wip8[col])/city_change
In [ ]:
pr_change_df2=pr_change_df.drop(index=0).reset_index().drop(columns='index')
pr_change_df2.tail()
Out[ ]:
YEAR QUARTER CHICAGO 3 1 6 7 4 5 16 ... 68 69 38 41 72 75 71 44 46 53
52 2021 2 1.0 0.073409 0.073409 1.615008 1.615008 1.504078 1.504078 1.336052 ... 1.081566 1.081566 1.019576 1.019576 1.181077 1.181077 1.122349 1.122349 0.603589 0.603589
53 2021 3 1.0 0.431818 0.431818 0.708556 0.708556 0.707219 0.707219 0.754011 ... 1.516043 1.516043 1.609626 1.609626 1.036096 1.036096 1.508021 1.508021 1.402406 1.402406
54 2021 4 1.0 0.305825 0.305825 0.359223 0.359223 0.517799 0.517799 1.079288 ... 1.407767 1.407767 1.389968 1.389968 1.213592 1.213592 1.064725 1.064725 1.487055 1.487055
55 2022 1 1.0 0.293814 0.293814 0.146907 0.146907 0.346649 0.346649 1.054124 ... 2.233247 2.233247 0.971649 0.971649 1.034794 1.034794 1.369845 1.369845 1.278351 1.278351
56 2022 2 1.0 1.804494 1.804494 0.434831 0.434831 0.905618 0.905618 0.957303 ... 1.422472 1.422472 1.229213 1.229213 0.924719 0.924719 1.451685 1.451685 0.892135 0.892135

5 rows × 35 columns

In [ ]:
reported_cost_renov4.tail(1)
Out[ ]:
COMMUNITY_AREA YEAR QUARTER 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 ... 68.0 69.0 70.0 71.0 72.0 73.0 74.0 75.0 76.0 77.0
60 2023 1 0.246773 0.499056 1.565268 0.39376 0.480358 0.361615 0.529522 0.249376 ... 0.121149 6.474478 0.17894 0.450188 0.390287 0.715368 0.409327 0.549577 0.309223 0.887154

1 rows × 79 columns

In [ ]:
pr_change_df2.iloc[:,2:].plot(figsize=(20,10),grid=True)
Out[ ]:
<AxesSubplot:>

Prediction 1: let's suppose increased applications cause accelerate price change in one year

TOTAL MESS STARTS HERE¶

In [ ]:
#Ok these are our price changes (target variable) - the last quarter that we have data for is quarter 2 2022
#Hence for our renovation costs we should only have data up until quarter 2 2021
In [ ]:
pr_change_df2.head(1)
Out[ ]:
YEAR QUARTER CHICAGO 3 1 6 7 4 5 16 ... 68 69 38 41 72 75 71 44 46 53
0 2008 2 1.0 0.253913 0.253913 -0.695652 -0.695652 0.556522 0.556522 0.730435 ... 1.92 1.92 1.066087 1.066087 1.34087 1.34087 0.751304 0.751304 0.853913 0.853913

1 rows × 35 columns

In [ ]:
pr_change_df2[]
Out[ ]:
YEAR QUARTER CHICAGO 3 1 6 7 4 5 16 ... 68 69 38 41 72 75 71 44 46 53
0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 2008.0 2.0 1.0 0.253913 0.253913 -0.695652 -0.695652 0.556522 0.556522 0.730435 ... 1.92 1.92 1.066087 1.066087 1.34087 1.34087 0.751304 0.751304 0.853913 0.853913

5 rows × 35 columns

In [ ]:
pr_change_df2.tail(1)
Out[ ]:
YEAR QUARTER CHICAGO 3 1 6 7 4 5 16 ... 68 69 38 41 72 75 71 44 46 53
56 2022 2 1.0 1.804494 1.804494 0.434831 0.434831 0.905618 0.905618 0.957303 ... 1.422472 1.422472 1.229213 1.229213 0.924719 0.924719 1.451685 1.451685 0.892135 0.892135

1 rows × 35 columns

In [ ]:
reported_cost_renov4.head(1)
Out[ ]:
COMMUNITY_AREA YEAR QUARTER 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 ... 68.0 69.0 70.0 71.0 72.0 73.0 74.0 75.0 76.0 77.0
0 2008 1 0.041919 0.142527 0.079029 0.047343 0.131885 0.135761 0.147981 0.150038 ... 0.030708 0.017003 0.076145 0.094289 0.110724 0.033966 0.057297 0.166993 0.040587 0.085271

1 rows × 79 columns

In [ ]:
renov_final_t=reported_cost_renov4.iloc[0:-7]
renov_final=renov_final_t[areas]
renov_final.head()
Out[ ]:
COMMUNITY_AREA 3.0 1.0 6.0 7.0 4.0 5.0 16.0 14.0 15.0 11.0 ... 68.0 69.0 38.0 41.0 72.0 75.0 71.0 44.0 46.0 53.0
0 0.079029 0.041919 0.135761 0.147981 0.047343 0.131885 0.060267 0.116731 0.210126 0.049650 ... 0.030708 0.017003 0.028022 0.120351 0.110724 0.166993 0.094289 0.159209 0.069464 0.046393
1 0.670424 0.260806 0.363951 0.264998 0.584058 1.950902 0.814462 1.162274 1.438010 1.227525 ... 0.143802 0.191602 1.868217 0.582116 1.750199 0.675497 0.245415 0.377882 0.365320 0.548055
2 0.452126 0.175653 0.621766 0.369282 0.601317 0.965578 0.258266 0.458249 0.345333 0.563529 ... 0.424320 1.899765 0.403069 0.520168 0.453148 0.362276 0.422803 0.108622 0.275831 0.275310
3 0.533570 0.710546 0.646972 0.389183 1.048825 0.334408 0.202372 0.311119 0.267434 0.847954 ... 0.119007 1.322207 0.422614 2.471277 0.522336 1.010354 1.185354 0.655286 1.287131 1.743544
4 0.671016 0.459152 0.556160 0.337366 0.307196 0.763959 0.234350 0.251509 0.490572 0.144482 ... 0.547294 0.323428 0.286429 0.516118 0.024312 0.369397 0.084672 0.093325 0.269063 2.288765

5 rows × 32 columns

In [ ]:
price_final=pr_change_df2.iloc[3:,3:].reset_index(drop=True)
In [ ]:
price_final.shape
Out[ ]:
(54, 32)
In [ ]:
#these are all areas
#reported_cost_renov4.iloc[0:-3,2:].plot(figsize=(30,15),grid=True);
In [ ]:
renov_costs_change=reported_cost_renov4.iloc[0:-3,2:].loc[:,areas]
In [ ]:
renov_costs_change.plot(figsize=(30,15), grid=True)
Out[ ]:
<AxesSubplot:>
In [ ]:
price_change=pr_change_df2.iloc[:,3:]
In [ ]:
pr_change_df2.tail(1)
Out[ ]:
YEAR QUARTER CHICAGO 3 1 6 7 4 5 16 ... 68 69 38 41 72 75 71 44 46 53
56 2022 2 1.0 1.804494 1.804494 0.434831 0.434831 0.905618 0.905618 0.957303 ... 1.422472 1.422472 1.229213 1.229213 0.924719 0.924719 1.451685 1.451685 0.892135 0.892135

1 rows × 35 columns

In [ ]:
price_change.plot(figsize=(20,10),grid=True)
Out[ ]:
<AxesSubplot:>
In [ ]:
#renov_costs_change vs price_change
In [ ]:
renov_costs_change.shape
Out[ ]:
(58, 32)
In [ ]:
price_change.shape
Out[ ]:
(57, 32)
In [ ]:
#price_final vs renov_final
In [ ]:
import seaborn as sns
In [ ]:
sns.scatterplot(x=renov_list,y=price_list)
Out[ ]:
<AxesSubplot:>
In [ ]:
test=pd.DataFrame({'x':renov_list,'y':price_list})
In [ ]:
test.corr()
Out[ ]:
x y
x 1.000000 0.013957
y 0.013957 1.000000
In [ ]:
import statsmodels.api as sm
from scipy.stats import norm
from scipy import stats
In [ ]:
stats.pearsonr(renov_list,price_list)
Out[ ]:
PearsonRResult(statistic=0.01395689006815116, pvalue=0.5620593208882761)
In [ ]:
renov_final.shape
Out[ ]:
(54, 32)
In [ ]:
price_final.shape
Out[ ]:
(5, 32)
In [ ]:
renov_list=[]
for col in renov_final.columns:
    for item in renov_final[col]:
        renov_list.append(item)
In [ ]:
len(renov_list)
Out[ ]:
1728
In [ ]:
price_list=[]
for col in price_final.columns:
    for item in price_final[col]:
        price_list.append(item)
In [ ]:
len(price_list)
Out[ ]:
1728
In [ ]: